Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / X11 / wais / ir / sighash.c < prev next >

Wrap

C/C++ Source or Header | 1995-05-09 | 10KB | 328 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* The memory hashtables for building an index. */ /* -brewster 5/90 */ /* main functions: * add_word * finished_add_word * look_up_word * * The idea is to store up a bunch of words before going to disk. * A word entry points to where it will go on disk, and * accumulates the entries before doing it. * * Some of the policy issues in this file are: * How much weight should the first occurance of a word in a document get * over the other occurances. The first occurance should be worth more * so that words with 3 occurances of "dog" and not "cat"'s should not * win out over 1 "dog" and 1 "cat" if the question is "Tell me about cats * torture dogs" * The extra weight is 5 at this point. * */ #ifndef lint static char *RCSid = "$Header: /proj/wais/wais-8-b5/ir/RCS/sighash.c,v 1.23 92/05/05 13:21:25 shen Exp Locker: shen $"; #endif /* Change log: * $Log: sighash.c,v $ * Revision 1.23 92/05/05 13:21:25 shen * undo the change make in previous revision 1.22 * fixed the bug in db->total_word_ccount by checking the last word in t * table. If the last word is DICTIONARY_TOTAL_SIZE_WORD, decrement the number * of words to add to dictionary file by 1. * * Revision 1.22 92/05/05 11:01:54 shen * fixed db->total_word_count for incremental update by eliminating adding * DICTIONARY_TOTAL_SIZE_WORD to hash table. * * Revision 1.21 92/04/29 08:21:15 shen * redefine MAX_OCCURANCES to a very big number: 0x10000000. * * Revision 1.20 92/03/20 11:04:18 jonathan * Added word_postition argument to add word. See irext.h for explanation. * * Revision 1.19 92/03/01 16:11:57 brewster * took out analyze_hashtable_distribution * * Revision 1.18 92/02/24 19:58:04 jonathan * Added code to add the dictionary to the hastable on startup. * * Revision 1.17 92/02/12 13:46:18 jonathan * Added "$Log" so RCS will put the log message in the header * */ /* To Do: * done: Improve the hashing functions. * done: stop inserting into hash table after max number have been accumulated * done: make flush not flush buffers that are too big. */ #include <ctype.h> #include <string.h> /* for strlen(), memset() */ #include "panic.h" #include "cutil.h" #include "futil.h" #include "irfiles.h" #include "irhash.h" #include "stoplist.h" #include "irinv.h" #include "sigindex.h" #ifdef UNIX #define PRINT_AS_INDEXING true /* also defined in irtfiles.c and irfiles.c */ #else #define PRINT_AS_INDEXING false #endif #define PROXIMITY /* this turns on writing out of all word occurances */ /* ---------------------------------------------------- */ static hash_entry* look_up_word _AP((char* word,hashtable* the_word_memory_hashtable)); static hash_entry* look_up_word(word,the_word_memory_hashtable) char* word; hashtable* the_word_memory_hashtable; { hash_entry * answer = get_hash(word, the_word_memory_hashtable); if(NULL != answer) return(answer); else{ hash_entry wrd_entry; answer = put_hash(word, the_word_memory_hashtable, &wrd_entry); answer->number_of_occurances = 0; answer->memory_ptr = NULL; answer->memory_size = 0; answer->current_memory_ptr = answer->memory_ptr; answer->current_doc_id = 0; return(answer); } } #ifdef NOTUSED static unsigned char add_weight _AP((long current_weight,long new_weight)); static unsigned char add_weight(current_weight,new_weight) long current_weight; long new_weight; /* add a new weight to the existing one */ { /* this should be smarter than this, like doing the log or something */ if(127 < (current_weight + new_weight)){ /* the max char. should be 255, but does not work on all compilers */ return(127); } else{ return(current_weight + new_weight); } } long write_bytes_to_memory(value,size,ptr) long value; long size; unsigned char* ptr; { /* writes the number into memory lsb first. returns the number of bytes written */ long i; long original_value = value; if(size < 0) /* paranoia */ panic("attempting to write a negative number of bytes"); ptr += size; /* start at the end of the block and write backwards */ for (i = 0; i < size; i++){ ptr--; *ptr = (unsigned char)(value & 0xFF); value = value >> 8; } if(value != 0) panic("In a call to write_bytes_to_memory, the value %ld can not be represented in %ld bytes", original_value, size); return(size); } #endif /* def NOTUSED */ /* adds a word to the hashtable. * Returns the 0 if successful. See irext.h for more documentation. */ long add_word(word, char_pos, line_pos, weight, doc_id, date, word_pair, db, word_position) char *word; /* the word to be indexed, this could be a word pair. If NULL there are no more words to be indexed */ long char_pos; /* the position of the start of the word */ long line_pos; /* this is passed for the best section calculation */ long weight; /* how important the word looks syntactically (such as is it bold) NOT used by signature system */ long doc_id; /* current document, this will never be 0 */ time_t date; /* display day of this document, 0 if not known */ long word_pair; database* db; /* database to insert the document */ boolean word_position; /* ignored here. */ { /* look up the word in the hashtable */ /* creates it if necessary */ hash_entry* wrd_entry; hashtable * the_word_memory_hashtable = db->the_word_memory_hashtable; /* printf("Word: '%s' doc_id: %ld, pos: %ld, weight: %ld\n", word, doc_id, char_pos, weight); */ if(NULL == db->the_word_memory_hashtable){ panic("The memory word hashtable is not defined."); } /* if we have indexed enough words flush the memory copies to disk. if(db->number_of_words_in_hashtable == db->flush_after_n_words) flush_memory_hashtable_to_disk(db, false); ** not done on sig system ** */ wrd_entry = look_up_word(word, the_word_memory_hashtable); wrd_entry->number_of_occurances ++; /* check if we have too many of this word before we add it */ #undef MAX_OCCURANCES #define MAX_OCCURANCES 0x10000000 if(wrd_entry->number_of_occurances < MAX_OCCURANCES){ db->number_of_words_in_hashtable ++; sig_add_word(word, char_pos, line_pos, weight, doc_id, date, word_pair); } return(0L); } void add_stop_words(the_word_memory_hashtable) hashtable *the_word_memory_hashtable; /* add the stop words to the hashtable. this must be done before adding other words */ { init_stop_list(); while(true){ char *word = next_stop_word(); hash_entry* wrd_entry; if(NULL == word) break; wrd_entry = look_up_word(word, the_word_memory_hashtable); wrd_entry->number_of_occurances = STOP_WORD_FLAG; } } long finished_add_word(db) database *db; { /* write out the dictioanry */ long i; long num_words; db->number_of_words = hashtable_count(db->the_word_memory_hashtable); init_dict_file_for_writing(db); /* analyze_hashtable_distribution(db->the_word_memory_hashtable); */ sort_hashtable(db->the_word_memory_hashtable); /* exclude the last word which is DICTIONARY_TOTAL_SIZE_WORD */ num_words = hashtable_count(db->the_word_memory_hashtable); if ( 0 == strcmp(db->the_word_memory_hashtable->contents[num_words-1].key, DICTIONARY_TOTAL_SIZE_WORD) ) num_words--; for(i = 0; i < num_words; i++){ hash_entry * entry = &db->the_word_memory_hashtable->contents[i]; if(0 == (STOP_WORD_FLAG & entry->number_of_occurances)){ /* write out the dictionary entry */ /* printf("Adding word: %s %ld entries\n", entry->word, entry->number_of_occurances); */ /* the position in this world is a unique id for every word */ add_word_to_dictionary(entry->key, i, entry->number_of_occurances, db); } } finished_add_word_to_dictionary(db); return(sig_finished_add_word(db)); } /* Add the dictionary to the hastable */ void add_dictionary_to_hashtable(db) database *db; { /* prints the contents of a dictionary */ FILE *stream = db->dictionary_stream; long i, j, new_number_of_dictionary_blocks; extern unsigned char *dictionary_header_block, *dictionary_block; extern long number_of_dictionary_blocks; if(stream != NULL) { waislog(WLOG_LOW, WLOG_INFO, "Adding dictionary to hastable"); s_fseek(stream, 0L, SEEK_SET); new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, stream); if(new_number_of_dictionary_blocks > number_of_dictionary_blocks) dictionary_header_block = NULL; number_of_dictionary_blocks = new_number_of_dictionary_blocks; if(NULL == (dictionary_header_block = read_dictionary_block(dictionary_header_block, DICTIONARY_HEADER_SIZE, number_of_dictionary_blocks, stream))) { waislog(WLOG_MEDIUM, WLOG_WARNING, "Could not read dictionary header block"); return; } for(i = 0; i < number_of_dictionary_blocks; i++){ long pos = dictionary_block_position(i, dictionary_header_block); if(NULL == (dictionary_block = read_dictionary_block(dictionary_block, pos, DICTIONARY_BLOCK_SIZE, stream))) { waislog(WLOG_MEDIUM, WLOG_WARNING, "Could not read dictionary block %ld", pos); } else /* iterate over words */ for(j = 0; j < DICTIONARY_BLOCK_SIZE; j++) { char *word = dictionary_block_word(j, dictionary_block); hash_entry* wrd_entry; if(word[0] == '\0' ) break; wrd_entry = look_up_word(word, db->the_word_memory_hashtable); wrd_entry->number_of_occurances = dictionary_block_word_occurances(j, dictionary_block); } } } } long init_add_word(db, hashtable_size, flush_after_n_words) database *db; long hashtable_size; long flush_after_n_words; { if(NULL != db->the_word_memory_hashtable) free_hashtable(db->the_word_memory_hashtable); db->the_word_memory_hashtable = make_hashtable(0, hashtable_size, sizeof(hash_entry)); db->flush_after_n_words = 0x7FFFFFFF; /* a large number */ sig_init_add_word(db, BATCH_UPDATE, ADD_UPDATE); add_dictionary_to_hashtable(db); add_stop_words(db->the_word_memory_hashtable); return(0); }